#!/usr/bin/env perl
use strict;
use warnings;
use threads;
use Thread::Queue;
use File::Basename;
use FindBin;
use Pod::Usage;

my $version = "v1.1";
# customized parameters
# Defaults for the settings below; each can be overridden by the command-line option named in its comment.
my $seq_path = ""; #path to the input sequence (fasta) file (-seq)
my $harvest_para = "-minlenltr 100 -maxlenltr 7000 -mintsd 4 -maxtsd 6 -motif TGCA -motifmis 1 -similar 85 -vic 10 -seed 20 -seqids yes"; #arguments passed verbatim to `gt ltrharvest`; no command-line option changes them
my $size_of_each_piece = 5000000; #piece length handed to the cut script, 5 Mb per piece (-size)
my $timeout = 1500; #maximum seconds for each gt command on one piece; enforced by running it under `timeout` (-time)
my $try1 = 1; #1, rerun a piece through this script in salvage mode (-size 50000) when ltrharvest exits non-zero, e.g. after a timeout. 0, no retry (-try1)
my $next = 0; #0, run LTRharvest. 1, only summarize results if the .harvest folder is retained (-next)
my $verbose = 0; #0, remove the .harvest folder after finished; 1, retain the folder for later use (-verbose|-v)
my $threads = 4; #number of worker threads; exactly 1 selects the single-threaded code path (-threads|-t)
my $annotator = "LTR_HARVEST_parallel"; #the annotator name written in the second column of the gff output

# dependencies
my $script_path = $FindBin::Bin; #folder containing this script
my $cut = "$script_path/bin/cut.pl"; #the program to cut sequence (-cut)
my $genometools = ""; #the path to genometools; looked up with `which gt` when left empty (-gt)
my $check_dependencies = undef; #set by -check_dependencies: print "Pass!" and quit
my $help = undef; #set by -help|-h: print the usage text and quit

# Help text printed for -help and when the mandatory -seq option is missing.
# Every option accepted by the option scan is listed here, including -cut.
my $usage = "
~ ~ ~ Run LTR_HARVEST in parallel ~ ~ ~

Modified from \@wild-joker by Shujun Ou (shujun.ou.1\@gmail.com)
Date: 10/16/2020
Update: 10/17/2020
Version: $version

Usage: perl LTR_HARVEST_parallel -seq [file] -size [int] -threads [int]
Options:	-seq	[file]	Specify the sequence file.
		-size	[int]	Specify the size you want to split the genome sequence.
				Please make it large enough to avoid splitting too many LTR elements. Default 5000000 (bp)
		-time	[int]	Specify the maximum time to run a subregion (a thread).
				This helps to skip simple repeat regions that take a substantial amount of time to run. Default: 1500 (seconds).
				Suggestion: 300 for -size 1000000. Increase -time when -size increased.
		-try1	[0|1]	If a region requires more time than the specified -time (timeout), decide:
					0, discard the entire region.
					1, further split to 50 Kb regions to salvage LTR candidates (default).
		-next		Only summarize the results for previous jobs without rerunning LTR_HARVEST (for -v).
		-verbose|-v	Retain LTR_HARVEST outputs for each sequence piece.
		-cut	[file]	The path to the cut.pl script used to split the sequence. Default: bin/cut.pl in the folder of this script.
		-gt	[file]	The path to the program GenomeTools (LTR_HARVEST included)
		-threads|-t	[int]	Indicate how many CPU/threads you want to run LTR_HARVEST.
		-check_dependencies	Check if dependencies are fulfilled and quit
		-help|-h	Display this help information.
\n";


# read user parameters
# Hand-rolled option scan: flags are matched case-insensitively, and an option
# that takes a value reads the argument that directly follows it.
my $i = 0; #position of the argument being examined
foreach my $arg (@ARGV){
	my $value = $ARGV[$i + 1]; #the following argument (undef past the end of the list)
	if    ($arg =~ /^-seq$/i)                { $seq_path = $value }
	elsif ($arg =~ /^-size$/i)               { $size_of_each_piece = $value }
	elsif ($arg =~ /^-time$/i)               { $timeout = $value }
	elsif ($arg =~ /^-try1$/i)               { $try1 = $value }
	elsif ($arg =~ /^-next$/i)               { $next = 1 }
	elsif ($arg =~ /^-verbose$|^-v$/i)       { $verbose = 1 }
	elsif ($arg =~ /^-cut$/i)                { $cut = $value }
	elsif ($arg =~ /^-gt$/i)                 { $genometools = $value }
	elsif ($arg =~ /^-threads$|^-t$/i)       { $threads = $value }
	elsif ($arg =~ /^-check_dependencies$/i) { $check_dependencies = 1 }
	elsif ($arg =~ /^-help$|^-h$/i)          { $help = 1 }
	$i++;
	}

# check parameters
# -help: show the usage text and leave with exit status 0
pod2usage( { -message => "$usage\n",
             -verbose => 0,
             -exitval => 0 } ) if $help;

# a sequence file is mandatory, except when only the dependency check was requested
unless ($seq_path or $check_dependencies){
    my $complaint = "At least 1 parameter mandatory:\n1) Input fasta file: -seq\n";
    pod2usage( { -message => "$complaint$usage\n\n",
                 -verbose => 0,
                 -exitval => 2 } );
}
print "Pass!\n";
exit if $check_dependencies; #reaching this point is all the dependency check does

# get GenomeTools if undefined
# Result: $genometools holds the folder of the gt executable, with a trailing slash.
if ($genometools eq ''){
	$genometools = `which gt 2>/dev/null`; #nothing given with -gt: ask the shell
}
$genometools =~ s/\s+$//; #strip trailing whitespace such as the newline printed by `which`
unless (-d $genometools){
	$genometools = dirname($genometools); #not a folder: keep only the directory part of the path
}
if ($genometools ne '' and $genometools !~ /\/$/){
	$genometools = "$genometools/";
}

-X "${genometools}gt" or die "LTR_HARVEST is not exist in the path $genometools!\n";
print "\nUsing this LTR_HARVEST: ${genometools}gt\n"; #test

# make a softlink to the genome file
# All later steps address the genome by its base name in the current folder.
# The built-in symlink is used instead of a shell `ln -s`, so input paths that
# contain spaces or shell metacharacters are linked correctly.
my $seq_file = basename($seq_path);
unless (-s $seq_file){
	symlink($seq_path, $seq_file)
		or warn "Could not link $seq_path to ./$seq_file: $!\n"; #keep going, as the shell version did
}

if ($threads == 1){
	# Single-threaded mode: build the index for the whole sequence file, run
	# LTRharvest on it once, and write the raw result to the combined .scn file.
	my $gt_bin = "${genometools}gt";
	my $index_files = join ' ', map { "$seq_file.$_" } qw(des esq lcp llv md5 prj sds ssp suf);

	`$gt_bin suffixerator -db $seq_file -indexname $seq_file -tis -suf -lcp -des -ssp -sds -dna 2>/dev/null`;
	`$gt_bin ltrharvest -index $seq_file $harvest_para > $seq_file.harvest.combine.scn 2>/dev/null`;
	`rm $index_files 2>/dev/null`; #the index files are no longer needed

} else {
## run the paralleled code
# Scan the genome once: note the length and order of every sequence and write
# the GFF3 preamble (version line, one ##sequence-region line per sequence,
# then one "#name" line per sequence).
open my $genome_fh, "<$seq_file" or die $usage;
open GFF, ">$seq_file.harvest.combine.gff3" or die $!;
print GFF "##gff-version   3\n";
my %seq; #sequence name => sequence length
my %order; #sequence name => 0-based position in the genome file
my $chr_info; #the "#name" lines, collected for printing after the region lines
$i=0;
{
	local $/ = "\n>"; #one fasta record per read; restored when this block ends
	while (my $record = <$genome_fh>){
		$record =~ s/>//g;
		my ($id, $residues) = split /\n/, $record, 2; #header line, then everything after it
		$residues =~ s/\s+//g;
		my $len = length $residues;
		print GFF "##sequence-region   $id 1 $len\n";
		$chr_info .= "#$id\n";
		$seq{$id} = $len;
		$order{$id} = $i++;
		}
}
print GFF "$chr_info";
close $genome_fh;

goto Next if $next == 1; #run the next step if all LTR_HARVEST processes are done

# Create (if needed) and enter the working folder. Both steps are checked:
# carrying on in the wrong directory would scatter the piece files and make the
# later `chdir "../"` leave the folder the script was started in.
unless (-d "$seq_file.harvest"){
	mkdir "$seq_file.harvest" or die "Cannot create folder $seq_file.harvest: $!\n";
}
chdir "$seq_file.harvest" or die "Cannot enter folder $seq_file.harvest: $!\n";

# Split the genome with the cut script. NOTE(review): the script is opaque from
# here; the code that follows expects it to leave the pieces in this folder and
# their names in ../$seq_file.list -- confirm against cut.pl.
`perl $cut ../$seq_file -s -l $size_of_each_piece`;

##multi-threading using queue, create a worker module for parallel computation
my $process_q=Thread::Queue->new();

# worker: body of every thread. Takes piece names off $process_q until the queue
# is exhausted. For each piece it builds the index and runs LTRharvest, both
# under `timeout`; when ltrharvest exits non-zero and $try1 is on, the piece is
# handed to this script again in salvage mode. Index files are removed at the end.
sub worker {
	while (my $piece = $process_q->dequeue()){
		chomp $piece;
		$piece =~ s/\s+//;
		my $cpu = "CPU" . threads->self()->tid();
		my $timed_gt = "timeout ${timeout}s ${genometools}gt";

		print localtime() . " $cpu: running on $piece\n";
		`$timed_gt suffixerator -db $piece -indexname $piece -tis -suf -lcp -des -ssp -sds -dna 2>/dev/null`;
		`$timed_gt ltrharvest -index $piece $harvest_para > $piece.harvest.scn 2>/dev/null`;
		my $harvest_status = $?; #read at once, before another command overwrites it

		if ($harvest_status ne 0 and $try1 ne 0){
			print localtime() . " $cpu: $piece timeout, process it with the salvage mode\n";
			`perl $script_path/LTR_HARVEST_parallel -seq $piece -size 50000 -time 30 -try1 0 -threads 1 -cut $cut -gt $genometools`;
			`mv $piece.harvest.combine.scn $piece.harvest.scn`; #the salvage result replaces the failed one
			`rm -rf ./${piece}.harvest 2>/dev/null`; #remove?
			}

		my $index_files = join ' ', map { "$piece.$_" } qw(des esq lcp llv md5 prj sds ssp suf);
		`rm $index_files 2>/dev/null`;
		}
	}

#insert seq names into the worker queue
open my $list_fh, "<../$seq_file.list" or die $!;
my @piece_names = <$list_fh>;
close $list_fh;
$process_q->enqueue(@piece_names);
$process_q->end(); #stop adding items to the queue

#work and finish: start $threads workers, then wait for every one of them
threads->create(\&worker) for 1..$threads;
$_->join() for threads->list();

chdir "../"; #back to the folder that holds the .harvest folder

Next:
#combine split ltr_harvest results
# The label above is the target of `goto Next`, taken with -next to skip the LTR_HARVEST runs.
open Out, '>', "$seq_file.harvest.combine.scn" or die "Cannot write $seq_file.harvest.combine.scn: $!\n";

#print out headers
# The "LTR_HARVEST args" line is filled in from $harvest_para, so the header
# always reports the arguments that were actually passed to gt ltrharvest.
print Out "#LTR_HARVEST_parallel -seq $seq_file -size $size_of_each_piece -time $timeout -try1 $try1 -threads $threads -cut $cut
# LTR_HARVEST args= $harvest_para
# LTR_HARVEST_parallel version=$version
# predictions are reported in the following way
# s(ret) e(ret) l(ret) s(lLTR) e(lLTR) l(lLTR) s(rLTR) e(rLTR) l(rLTR) sim(LTRs) seq-nr chr
# where:
# s = starting position
# e = ending position
# l = length
# ret = LTR-retrotransposon
# lLTR = left LTR
# rLTR = right LTR
# sim = similarity
# seq-nr = sequence order\n";

# Merge the per-piece LTRharvest tables into the combined .scn (handle Out) and
# .gff3 (handle GFF) outputs, shifting piece coordinates back to the genome scale.
open my $list_fh, '<', "$seq_file.list" or die "Cannot open $seq_file.list: $!\n";
my $count = 1; #count repeats in gff; advanced once per element so that every ID is unique
while (my $seq = <$list_fh>){
	$seq =~ s/\s+//;

	# Piece names are expected to look like <sequence name>_sub<number>. The
	# number is used as a 1-based piece index to compute the coordinate offset.
	my ($base, $order);
	if ($seq =~ /(.*)_sub([0-9]+)$/){
		($base, $order) = ($1, $2);
	} else {
		warn "Skipping $seq: the name does not end in _sub<number>\n";
		next;
	}
	my $coord_adj = ($order - 1) * $size_of_each_piece;

	my $scn_file = "$seq_file.harvest/$seq.harvest.scn";
	next unless -e $scn_file;
	open my $scn_fh, '<', $scn_file or die "Cannot open $scn_file: $!\n";

	# Length of the piece (all non-header characters), needed only to mirror
	# the coordinates of entries whose seq-nr is not 0.
	my $len = 0;
	if (open my $piece_fh, '<', "$seq_file.harvest/$seq"){
		while (my $line = <$piece_fh>){
			chomp $line;
			next if $line =~ /^>/;
			$len += length $line;
			}
		close $piece_fh;
	} else {
		warn "Cannot open $seq_file.harvest/$seq to measure its length: $!\n";
	}

	while (my $line = <$scn_fh>){
		next if $line =~ /^#/;
		chomp $line;
		next unless $line =~ /\S/; #nothing to parse on a blank line
		my ($start, $end, $ele_len, $ltrt_s, $lltr_e, $lltr, $rltr_s, $rltr_e, $rltr, $sim, $seq_id ) = split /\s+/, $line;
		my @coord = ($start, $end, $ltrt_s, $lltr_e, $rltr_s, $rltr_e);
		my @coord_re;
		#convert coordinates back to the genome scale
		if ($seq_id == 0) {
			@coord_re = map { $_ + $coord_adj } @coord;
		} else {
			# mirrored entry: each start/end pair is swapped and measured from the end of the piece
			foreach my $idx (0 .. $#coord) {
				my $partner = ($idx % 2 == 0) ? $idx + 1 : $idx - 1;
				push @coord_re, $coord_adj + $len - $coord[$partner];
				}
		}
		($start, $end, $ltrt_s, $lltr_e, $rltr_s, $rltr_e)=@coord_re;
		print Out "$start $end $ele_len $ltrt_s $lltr_e $lltr $rltr_s $rltr_e $rltr $sim $order{$base} $base\n";

		#print GFF format
		my $chr = $base;
		my $strand="+";
		if ($seq_id == 1) {$strand="-"}
		print GFF "$chr\t$annotator\trepeat_region\t$start\t$end\t.\t$strand\t.\tID=repeat_region$count\n";
		#print GFF "$chr\t$annotator\ttarget_site_duplication\t$lTSD\t.\t$strand\t.\tParent=repeat_region$count\n" unless $TSD eq "NA";
		print GFF "$chr\t$annotator\tLTR_retrotransposon\t$start\t$end\t.\t$strand\t.\tID=LTR_retrotransposon$count;Parent=repeat_region$count;ltr_identity=$sim;seq_number=$order{$chr}\n";
		print GFF "$chr\t$annotator\tlong_terminal_repeat\t$start\t$lltr_e\t.\t$strand\t.\tParent=LTR_retrotransposon$count\n";
		print GFF "$chr\t$annotator\tlong_terminal_repeat\t$rltr_s\t$end\t.\t$strand\t.\tParent=LTR_retrotransposon$count\n";
		#print GFF "$chr\t$annotator\ttarget_site_duplication\t$rTSD\t.\t$strand\t.\tParent=repeat_region$count\n" unless $TSD eq "NA";
		print GFF "###\n";

		$count++; #one number per element, not per piece
		}
	close $scn_fh;
	}
close $list_fh;
# Close both result files before cleaning up. Buffered write errors (a full
# disk, for example) only surface at close, so the intermediate folder is kept
# when either result could not be written completely.
close GFF or die "Error while writing $seq_file.harvest.combine.gff3: $!\n";
close Out or die "Error while writing $seq_file.harvest.combine.scn: $!\n";
`rm -rf ./$seq_file.harvest` if $verbose == 0; #keep the folder only when -verbose was given
}
print localtime() ." Job finished! Check out $seq_file.harvest.combine.scn\n";
